library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the NY house dataset CSV into a tibble (readr guesses the column types)
ny_house_data <- read_csv(
  file = "/Users/cyn_chen/Desktop/Group_C_NYCHouse/Data/NY-House-Dataset.csv"
)
## Rows: 4801 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): BROKERTITLE, TYPE, ADDRESS, STATE, MAIN_ADDRESS, ADMINISTRATIVE_AR...
## dbl (6): PRICE, BEDS, BATH, PROPERTYSQFT, LATITUDE, LONGITUDE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Drop every row that has a missing value in any column, then add lower-case
# numeric copies of the four measurement columns (the upper-case originals
# are left in place).
ny_house_data <- na.omit(ny_house_data)
ny_house_data <- ny_house_data %>%
  mutate(
    price = as.numeric(PRICE),
    beds = as.numeric(BEDS),
    bath = as.numeric(BATH),
    propertysqft = as.numeric(PROPERTYSQFT)
  )
# Remove price outliers with the 1.5 * IQR rule.
# Both quartiles of PRICE are computed in a single quantile() call.
price_quartiles <- quantile(ny_house_data$PRICE, c(0.25, 0.75))
Q1 <- price_quartiles[1]
Q3 <- price_quartiles[2]
# NOTE(review): this variable shares its name with stats::IQR(); calls to
# IQR(...) still resolve to the function, but the name is easy to misread.
IQR <- Q3 - Q1

# Anything further than 1.5 * IQR outside the quartiles counts as an outlier
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

# Keep only listings whose PRICE lies within [lower_bound, upper_bound]
ny_house_data <- ny_house_data %>%
  filter(PRICE >= lower_bound, PRICE <= upper_bound)
# Derive two helper columns on the listings table:
#   ZIP_CODE       - the five digits at the very end of STATE (NA if absent)
#   PRICE_PER_SQFT - PRICE divided by PROPERTYSQFT
library(stringr)
ny_house_data <- ny_house_data %>%
  mutate(
    ZIP_CODE = str_extract(STATE, "\\d{5}$"),
    PRICE_PER_SQFT = PRICE / PROPERTYSQFT
  )
# Median price per square foot for each ZIP code (one row per ZIP_CODE).
# ZIP_CODE was extracted as text, so it is cast to integer at the end of the
# pipeline.
median_prices_per_sqft <- ny_house_data %>%
  group_by(ZIP_CODE) %>%
  summarize(median_price = median(PRICE / PROPERTYSQFT, na.rm = TRUE)) %>%
  mutate(ZIP_CODE = as.integer(ZIP_CODE))
# Load the Modified ZIP Code Tabulation Area (MODZCTA) boundary table
ny_zipcode_shape <- read.csv(
  file = "/Users/cyn_chen/Desktop/Group_C_NYCHouse/Data/Modified_Zip_Code_Tabulation_Areas__MODZCTA_.csv"
)

# Attach the per-ZIP median to every boundary row; MODZCTA is matched
# against ZIP_CODE.
zip_codes_with_prices <- left_join(
  ny_zipcode_shape,
  median_prices_per_sqft,
  by = c("MODZCTA" = "ZIP_CODE")
)

# Keep only the rows in which every column is populated
rows_complete <- complete.cases(zip_codes_with_prices)
zip_codes_with_prices <- zip_codes_with_prices[rows_complete, ]

# Discard the columns that are not used afterwards
columns_to_drop <- c("label", "ZCTA", "pop_est")
keep_column <- !(names(zip_codes_with_prices) %in% columns_to_drop)
zip_codes_with_prices <- zip_codes_with_prices[, keep_column]

# Store the area code as a double
zip_codes_with_prices[["MODZCTA"]] <- as.numeric(zip_codes_with_prices[["MODZCTA"]])
# Slim data frame holding only what the marker map needs; PRICE is exposed
# under the name TOTAL_PRICE. with() lets the columns be referenced directly.
ny_house_data_map <- with(
  ny_house_data,
  data.frame(
    FORMATTED_ADDRESS = FORMATTED_ADDRESS,
    TYPE = TYPE,
    TOTAL_PRICE = PRICE,
    PRICE_PER_SQFT = PRICE_PER_SQFT,
    LONGITUDE = LONGITUDE,
    LATITUDE = LATITUDE
  )
)
library(leaflet)
# Clustered marker map of the individual listings. Each marker's popup shows
# the listing's address, establishment type and price per square foot.
#
# The formulas use bare column names, which leaflet evaluates against the
# data frame passed to leaflet(). This keeps marker positions and popup text
# tied to the same rows instead of indexing separate global objects with `$`.
p1 <- leaflet(ny_house_data_map) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~LONGITUDE,
    lat = ~LATITUDE,
    popup = ~paste(
      "<b>Address:</b>", FORMATTED_ADDRESS, "<br>",
      "<b>Establishment Type:</b> ", TYPE, "<br>",
      "<b>Price per sqft:</b> ", PRICE_PER_SQFT
    ),
    # Group nearby markers into clusters that expand on zoom
    clusterOptions = markerClusterOptions()
  )

# Display the map
p1
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(viridis)
## Loading required package: viridisLite
# Turn the WKT text in the `the_geom` column into sf geometries
# NOTE(review): no CRS is assigned here; presumably the coordinates are
# WGS84 longitude/latitude, as leaflet expects -- confirm against the source.
zip_codes_with_prices <- zip_codes_with_prices %>%
  st_as_sf(wkt = "the_geom")

# Continuous viridis colour scale spanning the per-ZIP median values
pal <- colorNumeric(
  palette = "viridis",
  domain = zip_codes_with_prices[["median_price"]]
)
# Choropleth of median price per square foot by ZIP code area.
# Polygons are filled using `pal`; clicking one opens a popup with its code
# and median value, and hovering outlines it.
leaflet_map <- leaflet(data = zip_codes_with_prices) %>%
  addTiles() %>%
  addPolygons(
    fillColor = ~pal(median_price), # Colour by median price per sqft
    fillOpacity = 0.7,
    color = "black", # Border colour
    weight = 1, # Border thickness
    # median_price holds a per-square-foot figure, so label it the same way
    # as the legend does.
    popup = ~paste(
      "Zip Code:", MODZCTA,
      "<br>Median Price Per Sqft:", median_price
    ),
    # addPolygons() has no `highlight` argument; hover styling is configured
    # through `highlightOptions`, which takes a highlightOptions() list.
    highlightOptions = highlightOptions(
      weight = 3,
      color = "white",
      bringToFront = TRUE
    )
  ) %>%
  addLegend(
    pal = pal,
    values = ~median_price,
    title = "Median Price Per Sqft",
    position = "bottomright" # Legend position
  )

# Display the map
leaflet_map